"""
Tests for URLValidator security module.
"""

import pytest

from src.local_deep_research.security.url_validator import (
    URLValidator,
    URLValidationError,
)


class TestIsUnsafeScheme:
    """Tests for URLValidator.is_unsafe_scheme()."""

    def test_javascript_scheme(self):
        """Detects javascript: as unsafe."""
        assert URLValidator.is_unsafe_scheme("javascript:alert('xss')") is True

    def test_javascript_scheme_uppercase(self):
        """Detects JAVASCRIPT: (case-insensitive)."""
        assert URLValidator.is_unsafe_scheme("JAVASCRIPT:alert('xss')") is True

    def test_javascript_scheme_mixed_case(self):
        """Detects JavaScript: (mixed case)."""
        assert URLValidator.is_unsafe_scheme("JavaScript:void(0)") is True

    def test_data_scheme(self):
        """Detects data: as unsafe."""
        assert (
            URLValidator.is_unsafe_scheme(
                "data:text/html,<script>alert('xss')</script>"
            )
            is True
        )

    def test_vbscript_scheme(self):
        """Detects vbscript: as unsafe."""
        assert URLValidator.is_unsafe_scheme("vbscript:msgbox('xss')") is True

    def test_about_scheme(self):
        """Detects about: as unsafe."""
        assert URLValidator.is_unsafe_scheme("about:blank") is True

    def test_blob_scheme(self):
        """Detects blob: as unsafe."""
        assert (
            URLValidator.is_unsafe_scheme("blob:https://example.com/uuid")
            is True
        )

    def test_file_scheme(self):
        """Detects file: as unsafe."""
        assert URLValidator.is_unsafe_scheme("file:///etc/passwd") is True

    def test_http_scheme_is_safe(self):
        """HTTP is not marked as unsafe scheme."""
        assert URLValidator.is_unsafe_scheme("http://example.com") is False

    def test_https_scheme_is_safe(self):
        """HTTPS is not marked as unsafe scheme."""
        assert URLValidator.is_unsafe_scheme("https://example.com") is False

    def test_empty_url(self):
        """Empty URL returns False (not unsafe)."""
        assert URLValidator.is_unsafe_scheme("") is False

    def test_none_url(self):
        """None URL returns False (not unsafe)."""
        assert URLValidator.is_unsafe_scheme(None) is False

    def test_whitespace_handling(self):
        """Handles leading/trailing whitespace."""
        assert (
            URLValidator.is_unsafe_scheme("  javascript:alert('xss')  ") is True
        )


class TestIsSafeUrl:
    """Tests for URLValidator.is_safe_url()."""

    def test_valid_https_url(self):
        """Valid HTTPS URL is safe."""
        assert URLValidator.is_safe_url("https://example.com") is True

    def test_valid_http_url(self):
        """Valid HTTP URL is safe."""
        assert URLValidator.is_safe_url("http://example.com") is True

    def test_valid_https_with_path(self):
        """HTTPS with path is safe."""
        assert (
            URLValidator.is_safe_url("https://example.com/path/to/page") is True
        )

    def test_valid_https_with_query(self):
        """HTTPS with query params is safe."""
        assert (
            URLValidator.is_safe_url("https://example.com/page?q=test&page=1")
            is True
        )

    def test_javascript_url_is_not_safe(self):
        """JavaScript URL is not safe."""
        assert URLValidator.is_safe_url("javascript:alert('xss')") is False

    def test_data_url_is_not_safe(self):
        """Data URL is not safe."""
        assert (
            URLValidator.is_safe_url("data:text/html,<script>alert(1)</script>")
            is False
        )

    def test_file_url_is_not_safe(self):
        """File URL is not safe."""
        assert URLValidator.is_safe_url("file:///etc/passwd") is False

    def test_empty_url_is_not_safe(self):
        """Empty URL is not safe."""
        assert URLValidator.is_safe_url("") is False

    def test_none_url_is_not_safe(self):
        """None URL is not safe."""
        assert URLValidator.is_safe_url(None) is False

    def test_fragment_only_with_fragments_allowed(self):
        """Fragment-only URL with fragments allowed."""
        assert (
            URLValidator.is_safe_url("#section-1", allow_fragments=True) is True
        )

    def test_fragment_only_with_fragments_disallowed(self):
        """Fragment-only URL with fragments disallowed."""
        assert (
            URLValidator.is_safe_url("#section-1", allow_fragments=False)
            is False
        )

    def test_mailto_allowed(self):
        """Mailto URL when allowed."""
        assert (
            URLValidator.is_safe_url(
                "mailto:test@example.com", allow_mailto=True
            )
            is True
        )

    def test_mailto_not_allowed(self):
        """Mailto URL when not allowed."""
        assert (
            URLValidator.is_safe_url(
                "mailto:test@example.com", allow_mailto=False
            )
            is False
        )

    def test_require_scheme_true(self):
        """URL without scheme when scheme required."""
        assert (
            URLValidator.is_safe_url("example.com/path", require_scheme=True)
            is False
        )

    def test_require_scheme_false(self):
        """URL without scheme when scheme not required."""
        assert (
            URLValidator.is_safe_url("example.com/path", require_scheme=False)
            is True
        )

    def test_ftp_scheme_is_safe(self):
        """FTP scheme is safe."""
        assert (
            URLValidator.is_safe_url("ftp://ftp.example.com/file.txt") is True
        )

    def test_ftps_scheme_is_safe(self):
        """FTPS scheme is safe."""
        assert (
            URLValidator.is_safe_url("ftps://ftp.example.com/file.txt") is True
        )

    def test_unknown_scheme_is_not_safe(self):
        """Unknown scheme is not safe."""
        assert URLValidator.is_safe_url("gopher://gopher.example.com") is False


class TestTrustedDomains:
    """Tests for trusted domain validation."""

    def test_trusted_domain_exact_match(self):
        """Exact domain match is trusted."""
        assert (
            URLValidator.is_safe_url(
                "https://example.com/path", trusted_domains=["example.com"]
            )
            is True
        )

    def test_trusted_domain_subdomain_match(self):
        """Subdomain of trusted domain is trusted."""
        assert (
            URLValidator.is_safe_url(
                "https://api.example.com/path", trusted_domains=["example.com"]
            )
            is True
        )

    def test_untrusted_domain_rejected(self):
        """Untrusted domain is rejected."""
        assert (
            URLValidator.is_safe_url(
                "https://malicious.com/path", trusted_domains=["example.com"]
            )
            is False
        )

    def test_multiple_trusted_domains(self):
        """Multiple trusted domains work correctly."""
        trusted = ["example.com", "trusted.org"]
        assert (
            URLValidator.is_safe_url(
                "https://example.com/", trusted_domains=trusted
            )
            is True
        )
        assert (
            URLValidator.is_safe_url(
                "https://trusted.org/", trusted_domains=trusted
            )
            is True
        )
        assert (
            URLValidator.is_safe_url(
                "https://other.com/", trusted_domains=trusted
            )
            is False
        )


class TestSuspiciousPatterns:
    """Tests for suspicious pattern detection."""

    def test_double_encoding_detected(self):
        """Double URL encoding is detected."""
        # %25 is encoded %
        assert (
            URLValidator.is_safe_url("https://example.com/path%252F..") is False
        )

    def test_null_byte_detected(self):
        """Null byte in URL is detected."""
        assert (
            URLValidator.is_safe_url("https://example.com/path%00.txt") is False
        )

    def test_html_entity_detected(self):
        """HTML entity encoding is detected."""
        assert (
            URLValidator.is_safe_url("https://example.com/path&lt;script&gt;")
            is False
        )


class TestSanitizeUrl:
    """Tests for URLValidator.sanitize_url()."""

    def test_sanitize_adds_https(self):
        """Adds https:// to URL without scheme."""
        result = URLValidator.sanitize_url("example.com/path")
        assert result == "https://example.com/path"

    def test_sanitize_adds_http_when_specified(self):
        """Adds specified default scheme."""
        result = URLValidator.sanitize_url(
            "example.com/path", default_scheme="http"
        )
        assert result == "http://example.com/path"

    def test_sanitize_preserves_existing_scheme(self):
        """Preserves existing scheme."""
        result = URLValidator.sanitize_url("http://example.com/path")
        assert result == "http://example.com/path"

    def test_sanitize_strips_whitespace(self):
        """Strips leading/trailing whitespace."""
        result = URLValidator.sanitize_url("  https://example.com  ")
        assert result == "https://example.com"

    def test_sanitize_rejects_javascript(self):
        """Rejects javascript: URL."""
        result = URLValidator.sanitize_url("javascript:alert('xss')")
        assert result is None

    def test_sanitize_rejects_data(self):
        """Rejects data: URL."""
        result = URLValidator.sanitize_url("data:text/html,<script>x</script>")
        assert result is None

    def test_sanitize_empty_url(self):
        """Returns None for empty URL."""
        result = URLValidator.sanitize_url("")
        assert result is None

    def test_sanitize_none_url(self):
        """Returns None for None URL."""
        result = URLValidator.sanitize_url(None)
        assert result is None


class TestIsAcademicUrl:
    """Tests for URLValidator.is_academic_url()."""

    def test_arxiv_is_academic(self):
        """arXiv URL is academic."""
        assert (
            URLValidator.is_academic_url("https://arxiv.org/abs/2301.12345")
            is True
        )

    def test_pubmed_is_academic(self):
        """PubMed URL is academic."""
        assert (
            URLValidator.is_academic_url(
                "https://pubmed.ncbi.nlm.nih.gov/12345678"
            )
            is True
        )

    def test_doi_is_academic(self):
        """DOI URL is academic."""
        assert (
            URLValidator.is_academic_url("https://doi.org/10.1234/example")
            is True
        )

    def test_biorxiv_is_academic(self):
        """bioRxiv URL is academic."""
        assert (
            URLValidator.is_academic_url("https://biorxiv.org/content/12345")
            is True
        )

    def test_nature_is_academic(self):
        """Nature URL is academic."""
        assert (
            URLValidator.is_academic_url("https://nature.com/articles/12345")
            is True
        )

    def test_subdomain_is_academic(self):
        """Subdomain of academic domain is academic."""
        assert (
            URLValidator.is_academic_url(
                "https://www.nature.com/articles/12345"
            )
            is True
        )

    def test_non_academic_url(self):
        """Non-academic URL returns False."""
        assert (
            URLValidator.is_academic_url("https://example.com/paper") is False
        )

    def test_empty_url_not_academic(self):
        """Empty URL is not academic."""
        assert URLValidator.is_academic_url("") is False

    def test_invalid_url_not_academic(self):
        """Invalid URL is not academic."""
        assert URLValidator.is_academic_url("not a url") is False


class TestExtractDoi:
    """Tests for URLValidator.extract_doi()."""

    def test_extract_doi_from_doi_org(self):
        """Extracts DOI from doi.org URL."""
        result = URLValidator.extract_doi(
            "https://doi.org/10.1234/example.paper"
        )
        assert result == "10.1234/example.paper"

    def test_extract_doi_direct(self):
        """Extracts DOI from direct DOI string."""
        result = URLValidator.extract_doi("10.1234/example.paper")
        assert result == "10.1234/example.paper"

    def test_extract_doi_with_special_chars(self):
        """Extracts DOI with special characters."""
        result = URLValidator.extract_doi(
            "https://doi.org/10.1234/example-paper.v2"
        )
        assert result == "10.1234/example-paper.v2"

    def test_extract_doi_no_match(self):
        """Returns None when no DOI found."""
        result = URLValidator.extract_doi("https://example.com/no-doi-here")
        assert result is None

    def test_extract_doi_empty(self):
        """Returns None for empty string."""
        result = URLValidator.extract_doi("")
        assert result is None


class TestValidateHttpUrl:
    """Tests for URLValidator.validate_http_url()."""

    def test_valid_https_url(self):
        """Valid HTTPS URL passes."""
        assert (
            URLValidator.validate_http_url("https://example.com/callback")
            is True
        )

    def test_valid_http_url(self):
        """Valid HTTP URL passes."""
        assert (
            URLValidator.validate_http_url("http://example.com/callback")
            is True
        )

    def test_missing_scheme_raises(self):
        """URL without scheme raises error."""
        with pytest.raises(URLValidationError) as exc_info:
            URLValidator.validate_http_url("example.com/callback")
        assert "scheme" in str(exc_info.value).lower()

    def test_wrong_scheme_raises(self):
        """Non-HTTP scheme raises error."""
        with pytest.raises(URLValidationError) as exc_info:
            URLValidator.validate_http_url("ftp://example.com/file")
        assert "http" in str(exc_info.value).lower()

    def test_empty_url_raises(self):
        """Empty URL raises error."""
        with pytest.raises(URLValidationError):
            URLValidator.validate_http_url("")

    def test_none_url_raises(self):
        """None URL raises error."""
        with pytest.raises(URLValidationError):
            URLValidator.validate_http_url(None)

    def test_missing_hostname_raises(self):
        """URL without hostname raises error."""
        with pytest.raises(URLValidationError) as exc_info:
            URLValidator.validate_http_url("http:///path")
        assert "hostname" in str(exc_info.value).lower()

    def test_invalid_hostname_raises(self):
        """Invalid hostname raises error."""
        with pytest.raises(URLValidationError):
            URLValidator.validate_http_url("https://.example.com/")

    def test_javascript_url_raises(self):
        """JavaScript URL fails security check."""
        with pytest.raises(URLValidationError):
            URLValidator.validate_http_url("javascript:alert('xss')")


class TestUrlValidatorConstants:
    """Tests for URLValidator constants."""

    def test_unsafe_schemes_complete(self):
        """All expected unsafe schemes are defined."""
        expected = {"javascript", "data", "vbscript", "about", "blob", "file"}
        actual = set(URLValidator.UNSAFE_SCHEMES)
        assert expected == actual

    def test_safe_schemes_complete(self):
        """All expected safe schemes are defined."""
        expected = {"http", "https", "ftp", "ftps"}
        actual = set(URLValidator.SAFE_SCHEMES)
        assert expected == actual

    def test_academic_domains_include_key_sites(self):
        """Key academic domains are included."""
        domains = URLValidator.TRUSTED_ACADEMIC_DOMAINS
        assert "arxiv.org" in domains
        assert "pubmed.ncbi.nlm.nih.gov" in domains
        assert "doi.org" in domains
        assert "nature.com" in domains
